{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['data1.txt', 'data2.txt', 'wiki_cleaned.txt']\n"
     ]
    }
   ],
   "source": [
    "# Open the data folder and list every file it contains.\n",
    "import os\n",
    "\n",
    "data_dir = 'data'\n",
    "# os.listdir returns entries in arbitrary order; sort them so the printed\n",
    "# listing is identical on every run and platform.\n",
    "# Note: the entries are bare file names relative to data_dir, not full paths.\n",
    "file_paths = sorted(os.listdir(data_dir))\n",
    "print(file_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from tokenizers import Tokenizer, models, trainers, pre_tokenizers\n",
    "\n",
    "# 1. Create a BPE tokenizer that uses the Whitespace pre-tokenizer.\n",
    "tokenizer = Tokenizer(models.BPE())\n",
    "tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()\n",
    "\n",
    "# 2. Train the tokenizer (10k-entry vocabulary plus the special tokens).\n",
    "trainer = trainers.BpeTrainer(special_tokens=[\"<pad>\", \"<bos>\", \"<eos>\"], vocab_size=10000)\n",
    "# The corpus lives inside the data folder, so pass its full path (as a list)\n",
    "# instead of a bare file name resolved against the working directory.\n",
    "file_paths = [os.path.join('data', 'wiki_cleaned.txt')]\n",
    "tokenizer.train(file_paths, trainer)\n",
    "\n",
    "# 3. Save the trained tokenizer (overwrites any existing bpe_tokenizer.json).\n",
    "tokenizer.save(\"bpe_tokenizer.json\")\n",
    "\n",
    "# 4. Smoke-test the tokenizer: encode a sentence, then decode the ids back.\n",
    "encoded = tokenizer.encode(\"hello world, how are you?\")\n",
    "print(\"编码:\", encoded.tokens)\n",
    "print(\"解码:\", tokenizer.decode(encoded.ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "使用设备: cpu\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from tokenizers import Tokenizer, models, trainers, pre_tokenizers\n",
    "\n",
    "# Tokenizer.train() has no num_threads argument; the library parallelises\n",
    "# training through its Rayon thread pool. The pool size is read from the\n",
    "# RAYON_NUM_THREADS environment variable (8 threads here, adjust to your CPU).\n",
    "# NOTE(review): this presumably only takes effect if it is set before the pool\n",
    "# is first used in this kernel, so restart the kernel when changing it.\n",
    "os.environ.setdefault(\"RAYON_NUM_THREADS\", \"8\")\n",
    "\n",
    "# 1. Create a BPE tokenizer that uses the Whitespace pre-tokenizer.\n",
    "tokenizer = Tokenizer(models.BPE())\n",
    "tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()\n",
    "\n",
    "# 2. Train the tokenizer (50k-entry vocabulary plus the special tokens).\n",
    "trainer = trainers.BpeTrainer(vocab_size=50000, special_tokens=[\"<pad>\", \"<bos>\", \"<eos>\"])\n",
    "# train() expects a list of file paths; the corpus lives inside the data folder.\n",
    "file_paths = [os.path.join('data', 'wiki_cleaned.txt')]\n",
    "tokenizer.train(file_paths, trainer)\n",
    "\n",
    "# 3. Save the trained tokenizer (overwrites any existing bpe_tokenizer.json).\n",
    "tokenizer.save(\"bpe_tokenizer.json\")\n",
    "\n",
    "# 4. Smoke-test the tokenizer: encode a sentence, then decode the ids back.\n",
    "encoded = tokenizer.encode(\"hello world, how are you?\")\n",
    "print(\"编码:\", encoded.tokens)\n",
    "print(\"解码:\", tokenizer.decode(encoded.ids))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
